############################################################
# Summary: This script trains multiple SVM models with hyperparameter optimization, 
# saves the best-performing models, and generates predictions for the full dataset.
#
# Note: Documentation and helper functions were drafted with the assistance of ChatGPT 
# due to the extensive amount of code required. However, I have personally verified 
# all major steps in the analysis and take full responsibility for every procedure 
# carried out in this script.

source("code/helper_analysis.R")


# ----------------------------------------------
# 1) Load the four labeled datasets
# ----------------------------------------------
# All input locations are collected in one named vector so they can be checked
# at a glance; each file is then read into its own data frame.
# NOTE(review): "labaled_data" looks like a misspelling of "labeled_data". It is
# left untouched because it has to match the directory name on disk.
labeled_paths <- c(
  strong_unbalanced   = "data/labaled_data/labeled_unbalanced_sample_strong.csv",
  combined_unbalanced = "data/labaled_data/labeled_unbalanced_sample_combined.csv",
  strong_balanced     = "data/labaled_data/labeled_balanced_sample_strong.csv",
  combined_balanced   = "data/labaled_data/labeled_balanced_sample_combined.csv"
)

df_strong_unbalanced   <- read_csv(labeled_paths[["strong_unbalanced"]])
df_combined_unbalanced <- read_csv(labeled_paths[["combined_unbalanced"]])
df_strong_balanced     <- read_csv(labeled_paths[["strong_balanced"]])
df_combined_balanced   <- read_csv(labeled_paths[["combined_balanced"]])

# ---------------------------------------------
# Core training configuration
# ---------------------------------------------
# Note: every training call in this script enables `save_models`, so this
# section is not a "no saving" run despite what an earlier header said.

# Class labels forwarded to the helper as `label_levels`.
levels_labels <- c("emp", "dur", "none")

# Seeds forwarded to the helper as `seeds` (one run per seed, presumably;
# confirm in code/helper_analysis.R).
seeds_more <- c(42, 123, 999)

# Hyperparameter grids.
kernels_to_run <- c("svmLinear")
C_grid <- 2^seq(-5, 5, by = 1)

# `sigma_grid` is forwarded to every training call but was never defined in
# this script. Keep a numeric value that is already in scope (e.g. one provided
# by the sourced helper file); otherwise fall back to a default so the argument
# can always be evaluated instead of failing with "object not found".
# NOTE(review): the fallback is a coarse exponential placeholder. Presumably
# only radial kernels consume sigma; pick a deliberate grid before adding
# "svmRadial" to `kernels_to_run`.
if (!exists("sigma_grid", mode = "numeric")) {
  sigma_grid <- 2^seq(-15, 3, by = 2)
}

# Fit and evaluate the SVM pipeline on one labeled dataset.
#
# The four runs below differ only in the data frame and its name; the label
# column, label levels, seeds, kernels and hyperparameter grids are shared and
# read from the configuration defined earlier in this script.
#   df           - labeled data frame to train on
#   dataset_name - identifier forwarded to the helper as `dataset_name`
# Returns whatever train_and_evaluate_model_multiple_seeds_kernels() returns.
fit_svm_for_dataset <- function(df, dataset_name) {
  train_and_evaluate_model_multiple_seeds_kernels(
    df = df,
    label_column = "labels",
    label_levels = levels_labels,
    seeds = seeds_more,
    kernels = kernels_to_run,
    C_grid = C_grid,
    sigma_grid = sigma_grid,
    # Spelled out as TRUE: `T` is an ordinary variable and can be reassigned.
    save_models = TRUE,
    dataset_name = dataset_name
  )
}

# Strong unbalanced
res_strong_unbalanced <- fit_svm_for_dataset(
  df_strong_unbalanced,
  "strong_unbalanced"
)

# Strong balanced
res_strong_balanced <- fit_svm_for_dataset(
  df_strong_balanced,
  "strong_balanced"
)

# Combined unbalanced
res_combined_unbalanced <- fit_svm_for_dataset(
  df_combined_unbalanced,
  "combined_unbalanced"
)

# Combined balanced
res_combined_balanced <- fit_svm_for_dataset(
  df_combined_balanced,
  "combined_balanced"
)

# Stack the per-dataset result tables into one tidy data frame.
result_tables <- list(
  make_result_df(res_strong_unbalanced,   "strong_unbalanced"),
  make_result_df(res_strong_balanced,     "strong_balanced"),
  make_result_df(res_combined_unbalanced, "combined_unbalanced"),
  make_result_df(res_combined_balanced,   "combined_balanced")
)
df_all_results <- dplyr::bind_rows(result_tables)

print(df_all_results)

# Quick look at the final best hyperparameters per dataset x kernel: rebuild
# each result table under a throwaway name and read its "final_best" attribute.
svm_runs <- list(
  strong_unbalanced   = res_strong_unbalanced,
  strong_balanced     = res_strong_balanced,
  combined_unbalanced = res_combined_unbalanced,
  combined_balanced   = res_combined_balanced
)
final_best <- lapply(svm_runs, function(run) {
  attr(make_result_df(run, "tmp"), "final_best")
})

# Per the original author's note: best C for linear; best (sigma, C) for
# radial, chosen by highest avg test macro-F1 across seeds.
final_best


################################################################################
################################################################################
################################################################################


# Flatten the nested `final_best` list (dataset -> kernel -> values) into one
# data frame with a row block per dataset/kernel combination.
per_dataset_rows <- lapply(names(final_best), function(dataset_id) {
  # Each dataset entry holds sublists such as $svmLinear and/or $svmRadial.
  best_by_kernel <- final_best[[dataset_id]]

  kernel_rows <- lapply(names(best_by_kernel), function(kernel_id) {
    best_df <- as.data.frame(best_by_kernel[[kernel_id]])
    best_df$dataset <- dataset_id
    best_df$kernel  <- kernel_id
    best_df
  })

  dplyr::bind_rows(kernel_rows)
})

# Combine all datasets and move the identifying columns to the front.
final_best_df <- dplyr::select(
  dplyr::bind_rows(per_dataset_rows),
  dataset, kernel, dplyr::everything()
)

print(final_best_df)

# Persist the combined results table. The output directory is created first
# (no-op if it already exists) so that a missing folder cannot abort the script
# after all models have been trained.
results_path <- "data/results/classification_results_svm_final.csv"
dir.create(dirname(results_path), recursive = TRUE, showWarnings = FALSE)
write_csv(df_all_results, results_path)

